-------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_MA
> TH_NOT_NO/AFQT_MATCHING_MATH_NOT_NO_with_weights.log
  log type:  text
 opened on:   9 May 2014, 12:11:41

. 
. /***************************************************
> MATCHING AFQT SCORES ACROSS NLSY 1979 and NLSY 1997
> 
> This do file creates a data file with comparable AFQT scores across both NLSY
> 79 and NLSY97.
> There are two main steps in creating the comparable AFQT scores:
> 
> 1. The 1979 ASVAB is a Paper and Pencil (P&P) test, while the 1997 ASVAB was 
> computer administered. 
> To make the scores comparable across cohorts, we rely on a percentile mapping
>  provided by Dan Segall (Segall (1997))
> 
> 2. The age at which respondents took the test differs between 1979 and 1997. 
> The 1997 sample is much younger.
> For both samples, we observe a large sample of individuals taking the test at
>  age 16. We use this overlap in the 
> test-taking age by mapping all test scores within cohorts into the age 16-dis
> tribution based on the within age 
> ranking of test scores. 
> 
> For details, see Segall (1997) and Altonji, Bharadwaj & Lange (2009).
> 
> Altonji, J., Bharadwaj, P. & Lange, F. "Changes in the Characteristics of Ame
> rican Youth - 
> Implications for Adult Outcomes" NBER Working Papers No. 13883, revised 2009.
> Segall, D. O. (1997). "Equating the CAT-ASVAB". In W. A. Sands, B. K. Waters,
>  & J. R. McBride (Eds.), 
>         Computerized adaptive testing: From inquiry to operation (pp. 181-198
> ). Washington, DC: American Psychological Association. 
> Date: August 19, 2009.
> ******************************************************/
. 
. // Set path to directory containing afqt1997a.csv //
. capture cd "/afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_M
> ATH_NOT_NO"

. * cd "/Users/JKukkur/Documents/Research/ABL/AFQT MATCHING"
. 
. tempfile afqt97 afqt_append nlsy_agestd agestd_afqt missings finished_product

. 
. /****************************************************************
> First Step of score conversion: 
> Transform CAT Test Scores from NLSY97 into Paper and Pencil Test Scores using
>  mapping provided by Dan Segall.
> Combine this data with raw data from NLSY79.
> *****************************************************************/
. 
. 
. // afqt1997a.csv contains individual id's and sex from NLSY1997 and the ASVAB
>  component scores provided by Dan Segall. 
. // These are P&P equivalent scores based on the mapping procedure described 
> in Segall (1996). 
. // Dan Segall supplied us with these P&P equivalent scores using ASVAB compon
> ent scores contained in (DICTIONARY FILE).
. insheet using afqt1997a.csv, comma
(36 vars, 8984 obs)

. ren v1 pid

. ren v2 male

. egen asvabMathNotNO=rowmean(mk ar) if mk!=0 & ar!=0 // mk=mathematical knowle
> dge, ar=arithmetic reasoning
(1891 missing values generated)

. keep pid male asvabMathNotNO

. sort pid

. save `afqt97', replace
(note: file /tmp/St05447.000004 not found)
file /tmp/St05447.000004 saved

. 
. // Merge age in for the NLSY97 sample //
. infile using age97.dct, clear

infile dictionary {
  R0000100 "PUBID - YTH ID CODE 1997"
  R1194100 "CV_AGE_INT_DATE 1997"
  R2553500 "CV_AGE_INT_DATE 1998"
  R3876300 "CV_AGE_INT_DATE 1999"
  R5453700 "CV_AGE_INT_DATE 2000"
  R7216000 "CV_AGE_INT_DATE 2001"
  S1531400 "CV_AGE_INT_DATE 2002"
  S2001000 "CV_AGE_INT_DATE 2003"
  S3801100 "CV_AGE_INT_DATE 2004"
}

(8984 observations read)

. ren R0000100    pid

. ren R1194100    age  // age as of 1997 (test-taking year for NLSY97) //

. keep pid age

. sort pid

. merge pid using `afqt97' 
(note: you are using old merge syntax; see [D] merge for new syntax)

. drop _merge 

. sort pid

. save `afqt97', replace
file /tmp/St05447.000004 saved

. 
. // Merge in weights for 1997 data //
. // We use the custom weight provided by the NLSY for the year 1997, the year 
> when the ASVAB was administered. //
. insheet using weights97.csv, clear
(2 vars, 8984 obs)

. sort pid

. merge pid using `afqt97'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge 

. gen sample=1                            // Sample Identifier: 1= 1997 NLSY sa
> mple, 0=1979 NLSY sample //

. sort sample pid

. save `afqt97', replace
file /tmp/St05447.000004 saved

. 
. // NLSY 1979 Sample: Age Information and AFQT-scores //
. infile using asvabMath.dct, clear

infile dictionary {
  R0000100 "ID# (1-12686) 79"
  R0000500 "DATE OF BIRTH - YR 79"
  R0173600 "SAMPLE ID  79 INT"
  R0406510 "AGE OF R @ INT DATE 80"
  R0618011 "PROFILES ASVAB SEC 2-STD SCRNR 81"
  R0618014 "PROFILES ASVAB SEC 5-STD SCRNR 81"
  R0618017 "PROFILES ASVAB SEC 8-STD SCRNR 81"
}

(12686 observations read)

. ren R0000100 pid                

. ren R0406510 age                                                             
>            // Age as of 1980 (test taking year for NLSY79) //

. ren R0000500 birthyear 

. ren R0173600 sampid

. ren R0618011 ar

. ren R0618017 mk

. qui replace ar=. if ar<0

. qui replace mk=. if mk<0

. egen asvabMathNotNO = rowmean(mk ar) if mk!=0 & ar!=0
(808 missing values generated)

. drop if asvabMathNotNO==.
(808 observations deleted)

. label var asvabMathNotNO "Math subset of ASVAB score"

. replace age=80-birthyear if age<0 & birthyear!=.        // Fill missing age u
> sing birth-year // 
(202 real changes made)

. drop birthyear

. gen sample=0                                                    // Sample Ide
> ntifier: 1= 1997 NLSY sample, 0=1979 NLSY sample //

. append using `afqt97'                           // Append the data-set for NL
> SY97 //

. sort sample pid

. save `afqt_append', replace
(note: file /tmp/St05447.000005 not found)
file /tmp/St05447.000005 saved

. 
. * Merge in 1979 weights
. // We use the custom weight provided by the NLSY for the year 1979, the year 
> when the ASVAB was administered. //
. 
. insheet using weights79.csv, clear
(2 vars, 12686 obs)

. gen sample=0

. sort sample pid

. merge sample pid using `afqt_append'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge

. 
. * The weights have implied 2 decimal places.
. replace weight=weight/100
weight was long now double
(21670 real changes made)

. sort sample pid

. save `afqt_append', replace
file /tmp/St05447.000005 saved

. 
. /***************************************************************************
> Second Step: Percentile mapping of P&P test scores into age=16 distribution
> ****************************************************************************/
. 
. *****           GENERATING PERCENTILES OF SCORES BY AGE AND NLSY-SAMPLE *****
> ****
. use `afqt_append', clear

. drop ar mk

. 
. // Drop those with missing math-subset (asvabMathNotNO) scores
. drop if asvabMathNotNO==.
(2699 observations deleted)

. 
. // For Table I in Introduction.doc
. bysort sample: tab age

-------------------------------------------------------------------------------
-> sample = 0

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         15 |        962        8.10        8.10
         16 |      1,511       12.72       20.82
         17 |      1,488       12.53       33.35
         18 |      1,432       12.06       45.40
         19 |      1,502       12.65       58.05
         20 |      1,558       13.12       71.17
         21 |      1,539       12.96       84.12
         22 |      1,529       12.87       96.99
         23 |        357        3.01      100.00
------------+-----------------------------------
      Total |     11,878      100.00

-------------------------------------------------------------------------------
-> sample = 1

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         12 |        960       13.53       13.53
         13 |      1,406       19.82       33.36
         14 |      1,479       20.85       54.21
         15 |      1,491       21.02       75.23
         16 |      1,324       18.67       93.90
         17 |        430        6.06       99.96
         18 |          3        0.04      100.00
------------+-----------------------------------
      Total |      7,093      100.00


. 
. 
. // For Figure 1 in NLSY79
. kdensity asvabMathNotNO if sample==0&age==16, addplot(kdensity asvabMathNotNO
>  if sample==1&age==16, lpattern(_)) title(Figure 1: AFQT Scores at Age 16) //
> /
> note("The NLSY79-scores are the P&P scores reported by the NLSY79." "The NLSY
> -97 scores are based on the CAT scores from NLSY97 and the equation by Segal 
> (1997)." "Both populations are weighted to be population representative.") //
> /
> legend(label(1 "NLSY 1979") label(2 "NLSY 1997") cols(2)) saving(HistScores, 
> replace)
(file HistScores.gph saved)

. 
. 
. // Combine sparsely populated age-groups in both samples with adjacent age-gr
> oups //
. replace age=22 if age==23 & sample==0
(357 real changes made)

. replace age=17 if age==18 & sample==1
(3 real changes made)

. 
. *******************************************************************
. *EXPANDING THE DATA AND CREATING PERCENTILES BY HAND
. // The following procedure improves the quality of the percentile mapping. 
. // The problem is that some observations 'belong' in several percentiles beca
> use they 
. // have large weights. Stata commands such as xtile will simply assign a uniq
> ue percentile to these
. // observations. Instead, we need to account for the fact that these observat
> ions belong to several pctiles. 
. // This is achieved by expanding the data-set proportionally to the weights a
> nd then generating percentiles. 
. 
. *expanding each observation by its weight - an observation with a weight of 1
> 100 is expanded into 11 observations. 
. gen percentile_rank=.
(18971 missing values generated)

. replace weight=round(weight/100)
(18971 real changes made)

. foreach num of numlist 0/160 {
  2.         qui expand `num' if weight==`num'
  3. }

. 
. *generating a unique rank within each sample and age
. bysort sample age: egen r=rank(asvabMathNotNO), u

. gen pasvabMathNotNO=.
(470787 missing values generated)

. 
. * Divide the rank by number of individuals corresponding to the population of
>  a given age and sample
. * to get the percentile of an individual.
. 
. gen holdey = 1/100

. bysort sample age: egen sumWgts=total(holdey)

. 
. *SAMPLE==0
. qui replace pasvabMathNotNO=round(r/sumWgts) if age==15 & sample==0

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==16 & sample==0

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==17 & sample==0

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==18 & sample==0

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==19 & sample==0

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==20 & sample==0

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==21 & sample==0

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==22 & sample==0

. 
. *SAMPLE 1
. qui replace pasvabMathNotNO=round(r/sumWgts) if age==12 & sample==1

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==13 & sample==1

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==14 & sample==1

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==15 & sample==1

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==16 & sample==1

. qui replace pasvabMathNotNO=round(r/sumWgts) if age==17 & sample==1

. 
. *dropping the duplicates now
. egen tag=tag(sample pid)

. keep if tag==1
(451816 observations deleted)

. drop tag

. 
. sort sample pasvabMathNotNO

. save `nlsy_agestd', replace
(note: file /tmp/St05447.000006 not found)
file /tmp/St05447.000006 saved

. 
. *****************************************************************************
. 
. // Within sample, we map AFQT scores by age into the age=16
. // distribution. We therefore require mean AFQT-scores of age=16 by percentil
> e in each sample. 
. // We need to generate these averages using the weights.
. bys sample pasvabMathNotNO: egen pop=sum(weight) if age==16
(16136 missing values generated)

. bys sample pasvabMathNotNO: egen tot_score=sum(asvabMathNotNO*weight) if age=
> =16        // Mean age=16 raw score for each percentile //
(16136 missing values generated)

. bys sample pasvabMathNotNO: gen mean=tot_score/pop if age==16
(16136 missing values generated)

. drop tot_score pop

. egen tag=tag(sample pasvabMathNotNO mean) if age==16                    // We
>  need only 1 obs per sample and percentile //

. keep if tag==1
(18769 observations deleted)

. ren mean asvabMathNotNO_std

. keep sample asvabMathNotNO_std pasvabMathNotNO asvabMathNotNO

. sort sample pasvabMathNotNO

. save `agestd_afqt', replace
(note: file /tmp/St05447.000007 not found)
file /tmp/St05447.000007 saved

. 
. use `nlsy_agestd', clear

. merge sample pasvabMathNotNO using `agestd_afqt' // Merge into each percentil
> e the age=16 corresponding score //
(note: you are using old merge syntax; see [D] merge for new syntax)
variables sample pasvabMathNotNO do not uniquely identify observations in the
    master data

. drop _merge

. keep sample pid male pasvabMathNotNO asvabMathNotNO_std asvabMathNotNO weight
>  age

. sort sample pid

. // The final data contains "asvabMathNotNO_std": which is the comparable scor
> e across the 2 NLSY samples //
. *keep if sample==1 // this drops the NLSY79 observations //
. ren pid ID

. ren weight weight_altonji

. gen year = 1979 if sample==0
(7093 missing values generated)

. replace year = 1997 if sample==1 
(7093 real changes made)

. *drop sample age asvabMathNotNO pasvabMathNotNO // drop the other variables s
> ince I don't use them in my research //
. preserve

.         use `afqt97', clear

.         keep pid male

.         ren pid ID

.         save `missings', replace
(note: file /tmp/St05447.000008 not found)
file /tmp/St05447.000008 saved

. restore

. 
. preserve

.         keep if sample==1
(11878 observations deleted)

.         merge 1:1 ID using `missings'

    Result                           # of obs.
    -----------------------------------------
    not matched                         1,891
        from master                         0  (_merge==1)
        from using                      1,891  (_merge==2)

    matched                             7,093  (_merge==3)
    -----------------------------------------

.         tab _merge

                 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
         using only (2) |      1,891       21.05       21.05
            matched (3) |      7,093       78.95      100.00
------------------------+-----------------------------------
                  Total |      8,984      100.00

.         zscore asvabMathNotNO_std
z_asvabMathNotNO_std created with 1891 missing values

.         drop asvabMathNotNO_std

.         ren z_asvabMathNotNO_std asvabMathNotNO_std

.         keep if male==1
(4385 observations deleted)

.         sort ID

.         outsheet ID asvabMathNotNO_std using altonjiAFQT.csv, comma nol repla
> ce

. restore

. save afqt_adjusted_final, replace 
file afqt_adjusted_final.dta saved

. save `finished_product', replace
(note: file /tmp/St05447.000009 not found)
file /tmp/St05447.000009 saved

. 
. tabstat asvabMathNotNO_std, by(sample) stats(n min max mean median)

Summary for variables: asvabMathNotNO_std
     by categories of: sample 

  sample |         N       min       max      mean       p50
---------+--------------------------------------------------
       0 |     11878  31.63655        67  46.07282  43.67105
       1 |      7093  28.71233        67  48.17814        48
---------+--------------------------------------------------
   Total |     18971  28.71233        67  46.85997        45
------------------------------------------------------------

. 
. isid ID year

. 
. * Test: the afqt distributions across age within sample should now be identic
> al. There will still be very small deviations
. * because of the coarseness of the above expansion, but we believe these to b
> e third order. To allow this code to run 
. * rapidly, we tolerate these deviations. 
. 
. **bys sample age: sum asvabMathNotNO_std [fw=weight], d
. 
. use `finished_product', clear

. ** Generate a NLSY79-only supplement:
. drop if sample==1
(7093 observations deleted)

. save afqt_adjusted_final79, replace
file afqt_adjusted_final79.dta saved

. 
. use `finished_product', clear

. ** Generate a NLSY97-only supplement:
. drop if sample==0
(11878 observations deleted)

. save afqt_adjusted_final97, replace
file afqt_adjusted_final97.dta saved

. 
. log close
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_MA
> TH_NOT_NO/AFQT_MATCHING_MATH_NOT_NO_with_weights.log
  log type:  text
 closed on:   9 May 2014, 12:12:11
-------------------------------------------------------------------------------
